In [1]:
# Install the local package (editable mode) into this kernel's environment.
# NOTE(review): per the pip notice below, restart the kernel after this cell
# so the updated package is actually picked up.
%pip install -qe ..
[notice] A new release of pip is available: 24.0 -> 25.1.1 [notice] To update, run: pip install --upgrade pip Note: you may need to restart the kernel to use updated packages.
In [2]:
import duckdb
import japanize_matplotlib # noqa: F401
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
In [3]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
In [5]:
In [4]:
plt.style.use("ggplot")
In [ ]:
In [5]:
# Load the hourly per-pool table from the local DuckDB extract.
# read_only=True avoids taking a write lock on the database file.
with duckdb.connect("../data/raw/etl_from_sf.duckdb", read_only=True) as con:
    df = con.execute("SELECT * FROM raw_clean_with_pool").df()
# Preview the first rows to sanity-check the load.
df.head()
Out[5]:
| hour_ts | id | volume_usd | tvl_usd | liquidity | volume_token0 | volume_token1 | fees_usd | open_price | high_price | ... | load_ts | fee_tier | token0_id | token0_symbol | token0_name | token0_decimals | token1_id | token1_symbol | token1_name | token1_decimals | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1746784800 | 0xac8ba9d6fa2f9ff7c712bef5670cb0f7e4177965-485218 | 698.000000 | 2.711293e+05 | 3.112152e+17 | 6346.977788 | 698.000000 | 2.094000 | 9.059070 | 9.059070 | ... | 2025-05-09 11:00:09.148000+00:00 | 3000 | 0x2d8ea194902bc55431420bd26be92b0782dce91d | ZND | ZNDToken | 18 | 0xdac17f958d2ee523a2206206994597c13d831ec7 | USDT | Tether USD | 6 |
| 1 | 1745672400 | 0xfa6e8e97ececdc36302eca534f63439b1e79487b-484909 | 302.294707 | 5.368977e+04 | 2.713280e+13 | 302.346914 | 302.242500 | 0.030229 | 1.000449 | 1.000449 | ... | 2025-05-09 11:00:22.403000+00:00 | 100 | 0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48 | USDC | USD Coin | 6 | 0xdac17f958d2ee523a2206206994597c13d831ec7 | USDT | Tether USD | 6 |
| 2 | 1745672400 | 0x7ce04536279a74951c1a0148b5488d8a027a9bf8-484909 | 6.605206 | 2.190198e+02 | 4.443265e+19 | 82.062242 | 0.003694 | 0.066052 | 22165.049401 | 22165.049401 | ... | 2025-05-09 11:00:22.403000+00:00 | 10000 | 0x419c4db4b9e25d6db2ad9691ccb832c8d9fda05e | DRGN | Dragon | 18 | 0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2 | WETH | Wrapped Ether | 18 |
| 3 | 1744063200 | 0x0598690ad1daffc3ac47fd6adcaeac547c067e26-484462 | 0.000000 | 3.658733e+06 | 7.049047e+26 | 0.000000 | 0.000000 | 0.000000 | 1.007475 | 1.007475 | ... | 2025-05-09 11:51:11.735000+00:00 | 3000 | 0x57e114b691db790c35207b2e685d4a43181e6061 | ENA | ENA | 18 | 0x8be3460a480c80728a8c4d7a5d5303c85ba7b3b9 | sENA | Staked ENA | 18 |
| 4 | 1744063200 | 0x6288694eb218614a27777f2b52d3f8d4819233c0-484462 | 1697.111986 | 3.920453e+03 | 6.054509e+20 | 79772.446480 | 1.080806 | 16.971120 | 78952.973584 | 81377.447456 | ... | 2025-05-09 11:51:11.735000+00:00 | 10000 | 0xb1d1eae60eea9525032a6dcb4c1ce336a1de71be | DRV | Derive | 18 | 0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2 | WETH | Wrapped Ether | 18 |
5 rows × 28 columns
In [6]:
raw_clean_with_pool = df
In [7]:
# build_raw_clean_table("../data/raw", "../data/raw/etl_raw_2.duckdb")
In [8]:
# con = duckdb.connect(database="../data/raw/etl_raw_2.duckdb")
# raw_clean_with_pool = con.execute("SELECT * FROM raw_clean_with_pool").df()
# raw_clean_with_pool.head()
データ変換¶
In [9]:
# Convert the unix epoch seconds in hour_ts to a pandas datetime column.
# NOTE(review): no timezone is attached; values are presumably UTC — confirm upstream.
raw_clean_with_pool["datetime"] = pd.to_datetime(raw_clean_with_pool["hour_ts"], unit="s")
In [10]:
# Extract the pool address and index from the composite id "<pool_address>-<block_index>".
# Split the column once and reuse the result instead of calling str.split twice.
id_parts = raw_clean_with_pool["id"].str.split("-")
raw_clean_with_pool["pool_address"] = id_parts.str[0]
raw_clean_with_pool["block_index"] = id_parts.str[1]
In [11]:
# Report fully duplicated rows, then drop them.
dup_count = raw_clean_with_pool.duplicated().sum()
print(f"重複行数: {dup_count}")
raw_clean_with_pool = raw_clean_with_pool.drop_duplicates()
print(f"重複除去後の行数: {len(raw_clean_with_pool)}")
重複行数: 0 重複除去後の行数: 134736
In [12]:
# Row counts per fee tier, as a two-column frame: fee_tier / count.
fee_tier_counts = (
    raw_clean_with_pool["fee_tier"]
    .value_counts()
    .rename_axis("fee_tier")
    .reset_index(name="count")
)
In [13]:
# Summary statistics for the numeric columns. Note the extreme magnitudes in
# the price/liquidity columns (up to ~1e56 in the output) — this motivates the
# clipping and outlier-removal steps below.
raw_clean_with_pool.describe()
Out[13]:
| hour_ts | volume_usd | tvl_usd | liquidity | volume_token0 | volume_token1 | fees_usd | open_price | high_price | low_price | close_price | tx_count | tick | sqrt_price | token0_price | token1_price | datetime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.347360e+05 | 1.347360e+05 | 1.347360e+05 | 1.347360e+05 | 1.347360e+05 | 1.347360e+05 | 134736.000000 | 1.347360e+05 | 1.347360e+05 | 1.347360e+05 | 1.347360e+05 | 134736.000000 | 134734.000000 | 1.347360e+05 | 1.347360e+05 | 1.347360e+05 | 134736 |
| mean | 1.743539e+09 | 5.510828e+04 | 4.231219e+06 | 2.937187e+30 | 1.114310e+16 | 1.325911e+11 | 56.812143 | 2.525362e+51 | 2.525367e+51 | 2.525362e+51 | 2.525367e+51 | 7.497966 | -58742.235197 | 3.037088e+44 | 2.525367e+51 | 2.553138e+45 | 2025-04-01 20:28:27.766298624 |
| min | 1.741126e+09 | 0.000000e+00 | -1.122805e+05 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000 | -887272.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2025-03-04 22:00:00 |
| 25% | 1.741748e+09 | 2.596662e+01 | 2.170830e+04 | 3.601585e+17 | 8.994876e+00 | 1.078082e-01 | 0.087213 | 4.790029e-01 | 4.972653e-01 | 4.747034e-01 | 4.940731e-01 | 1.000000 | -139830.250000 | 7.285223e+25 | 4.940731e-01 | 3.572542e-05 | 2025-03-12 03:00:00 |
| 50% | 1.743001e+09 | 2.410914e+02 | 2.090232e+05 | 1.181802e+21 | 9.500936e+02 | 4.329109e+00 | 0.983544 | 1.169327e+02 | 1.196488e+02 | 1.153471e+02 | 1.183872e+02 | 2.000000 | -79293.500000 | 1.503591e+27 | 1.183872e+02 | 8.182775e-03 | 2025-03-26 15:00:00 |
| 75% | 1.745255e+09 | 1.910617e+03 | 1.180628e+06 | 3.255444e+22 | 3.629037e+04 | 1.129278e+03 | 7.956426 | 2.705184e+04 | 2.768699e+04 | 2.674672e+04 | 2.742095e+04 | 5.000000 | 34735.250000 | 4.496508e+29 | 2.742095e+04 | 1.934087e+00 | 2025-04-21 17:00:00 |
| max | 1.746853e+09 | 1.432742e+08 | 4.263057e+08 | 1.833394e+34 | 1.277522e+21 | 7.598498e+15 | 49084.157146 | 3.402568e+56 | 3.402568e+56 | 3.402568e+56 | 3.402568e+56 | 1065.000000 | 887271.000000 | 1.461447e+48 | 3.402568e+56 | 3.402568e+50 | 2025-05-10 05:00:00 |
| std | 1.844010e+06 | 8.091992e+05 | 2.493916e+07 | 2.302294e+32 | 3.490701e+18 | 2.538906e+13 | 578.628932 | 9.269685e+53 | 9.269685e+53 | 9.269685e+53 | 9.269685e+53 | 32.755489 | 158515.483019 | 2.106574e+46 | 9.269685e+53 | 9.270152e+47 | NaN |
In [14]:
# Cast the measurement columns to numeric dtypes; values that fail to parse
# become NaN (errors="coerce") and are handled in the next cleaning step.
num_cols = [
    "volume_usd",
    "tvl_usd",
    "fees_usd",
    "open_price",
    "high_price",
    "low_price",
    "close_price",
    "liquidity",
    "volume_token0",
    "volume_token1",
    "tx_count",
    "tick",
    "sqrt_price",
]
raw_clean_with_pool[num_cols] = raw_clean_with_pool[num_cols].apply(pd.to_numeric, errors="coerce")
異常値の検出 & クリーニング¶
In [15]:
# Floor TVL at zero (negative TVL is a data artifact) and drop the extreme
# top 0.1% of hourly volume so downstream statistics are not dominated by it.
raw_clean_with_pool["tvl_usd"] = raw_clean_with_pool["tvl_usd"].clip(lower=0)
vol_cap = raw_clean_with_pool["volume_usd"].quantile(0.999)
keep_mask = raw_clean_with_pool["volume_usd"] <= vol_cap
raw_clean_with_pool = raw_clean_with_pool[keep_mask]
In [16]:
# Count NaNs introduced by the numeric casts, then drop the affected rows
# (the output shows only 2 rows are lost, both via the tick column).
na_counts = raw_clean_with_pool[num_cols].isna().sum()
print(na_counts)
raw_clean_with_pool = raw_clean_with_pool.dropna(subset=num_cols)
volume_usd 0 tvl_usd 0 fees_usd 0 open_price 0 high_price 0 low_price 0 close_price 0 liquidity 0 volume_token0 0 volume_token1 0 tx_count 0 tick 2 sqrt_price 0 dtype: int64
可視化¶
時系列分析¶
In [17]:
# Daily total trade volume across all pools.
daily_volume = (
    raw_clean_with_pool.groupby(raw_clean_with_pool["datetime"].dt.date).agg({"volume_usd": "sum"}).reset_index()
)
plt.figure(figsize=(12, 6))
sns.lineplot(data=daily_volume, x="datetime", y="volume_usd", marker="o", markersize=4, linewidth=1.5)
plt.title("日次総取引量")
plt.xlabel("日付")
plt.ylabel("取引量 (USD)")
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Per-pool time series for the 5 pools with the highest total volume.
top_pools = raw_clean_with_pool.groupby("pool_address")["volume_usd"].sum().nlargest(5).index
for pool in top_pools:
    dfp = raw_clean_with_pool[raw_clean_with_pool["pool_address"] == pool].copy()
    # Aggregate to daily granularity.
    dfp["date"] = dfp["datetime"].dt.date
    dfp_agg = dfp.groupby("date")[["volume_usd", "tvl_usd"]].sum().reset_index()
    # Twin-axis figure: volume on the left, TVL on the right.
    fig, ax1 = plt.subplots(figsize=(12, 6))
    # Left axis: volume.
    color1 = "tab:blue"
    line1 = sns.lineplot(
        data=dfp_agg, x="date", y="volume_usd", marker="o", markersize=4, linewidth=1.5, color=color1, ax=ax1
    )
    ax1.set_xlabel("日付")
    ax1.set_ylabel("取引量 (USD)", color=color1)
    ax1.tick_params(axis="y", labelcolor=color1)
    # Right axis: TVL.
    ax2 = ax1.twinx()
    color2 = "tab:red"
    line2 = sns.lineplot(
        data=dfp_agg, x="date", y="tvl_usd", marker="o", markersize=4, linewidth=1.5, color=color2, ax=ax2
    )
    ax2.set_ylabel("TVL (USD)", color=color2)
    ax2.tick_params(axis="y", labelcolor=color2)
    # Title and date-axis formatting.
    plt.title(f"プール {pool[:10]}… の日次推移")
    ax1.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))
    plt.xticks(rotation=45)
    # Merge the legends of both axes into one box.
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, ["日次取引量", "日次TVL"], loc="upper left")
    # Final layout and display.
    sns.despine(left=False, right=False)
    plt.tight_layout()
    plt.show()

# Optional: overlay the daily volume of all top pools in one chart.
plt.figure(figsize=(14, 8))
pool_data = []
for pool in top_pools:
    dfp = raw_clean_with_pool[raw_clean_with_pool["pool_address"] == pool].copy()
    dfp["date"] = dfp["datetime"].dt.date
    dfp_agg = dfp.groupby("date")["volume_usd"].sum().reset_index()
    dfp_agg["pool"] = pool[:8] + "..."  # first 8 chars of the address as the label
    pool_data.append(dfp_agg)
all_pools_data = pd.concat(pool_data)
sns.lineplot(data=all_pools_data, x="date", y="volume_usd", hue="pool", marker="o", markersize=4)
plt.title("トッププール5件の日次取引量比較")
plt.xlabel("日付")
plt.ylabel("取引量 (USD)")
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))
plt.xticks(rotation=45)
plt.legend(title="Pool")
plt.tight_layout()
plt.show()
In [18]:
# Same per-pool daily analysis as above, but using resample("D") on a
# datetime index instead of grouping by .dt.date.
# NOTE(review): this duplicates the previous cell almost verbatim — the two
# approaches are presumably kept side by side for comparison; consider keeping one.
for pool in top_pools:
    dfp = raw_clean_with_pool[raw_clean_with_pool["pool_address"] == pool].copy()
    # Index by datetime and resample to daily sums.
    dfp = dfp.set_index("datetime").sort_index()
    dfp_agg = dfp[["volume_usd", "tvl_usd"]].resample("D").sum().reset_index()
    # Twin-axis figure: volume on the left, TVL on the right.
    fig, ax1 = plt.subplots(figsize=(12, 6))
    # Left axis: volume.
    color1 = "tab:blue"
    line1 = sns.lineplot(
        data=dfp_agg, x="datetime", y="volume_usd", marker="o", markersize=4, linewidth=1.5, color=color1, ax=ax1
    )
    ax1.set_xlabel("日付")
    ax1.set_ylabel("取引量 (USD)", color=color1)
    ax1.tick_params(axis="y", labelcolor=color1)
    # Right axis: TVL.
    ax2 = ax1.twinx()
    color2 = "tab:red"
    line2 = sns.lineplot(
        data=dfp_agg, x="datetime", y="tvl_usd", marker="o", markersize=4, linewidth=1.5, color=color2, ax=ax2
    )
    ax2.set_ylabel("TVL (USD)", color=color2)
    ax2.tick_params(axis="y", labelcolor=color2)
    # Title and date-axis formatting.
    plt.title(f"プール {pool[:10]}… の日次推移 (resample)")
    ax1.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))
    plt.xticks(rotation=45)
    # Merge the legends of both axes into one box.
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, ["日次取引量", "日次TVL"], loc="upper left")
    # Final layout and display.
    sns.despine(left=False, right=False)
    plt.tight_layout()
    plt.show()

# Additional option: overlay all top pools in a single chart (resample version).
plt.figure(figsize=(14, 8))
pool_data = []
for pool in top_pools:
    dfp = raw_clean_with_pool[raw_clean_with_pool["pool_address"] == pool].copy()
    # Use resample for the daily aggregation.
    dfp = dfp.set_index("datetime").sort_index()
    dfp_agg = dfp[["volume_usd"]].resample("D").sum().reset_index()
    dfp_agg["pool"] = pool[:8] + "..."  # first 8 chars of the address as the label
    pool_data.append(dfp_agg)
all_pools_data = pd.concat(pool_data)
sns.lineplot(data=all_pools_data, x="datetime", y="volume_usd", hue="pool", marker="o", markersize=4)
plt.title("トッププール5件の日次取引量比較 (resample)")
plt.xlabel("日付")
plt.ylabel("取引量 (USD)")
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))
plt.xticks(rotation=45)
plt.legend(title="Pool")
plt.tight_layout()
plt.show()
In [ ]:
特徴量エンジニアリング¶
In [20]:
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize
# Build the per-pool time-series features (applied to one pool's rows at a time).
def add_time_features(group):
    """Append rolling / ratio features for anomaly detection to one pool's rows.

    Expects columns: datetime, volume_usd, tvl_usd, open_price, close_price.
    Returns a copy sorted by datetime with five new columns added.
    """
    # Work on a time-ordered copy (sort_values returns a new frame).
    ordered = group.sort_values("datetime")
    rolling_vol = ordered["volume_usd"].rolling(24)
    # 24-hour moving average of volume.
    ordered["volume_ma24"] = rolling_vol.mean()
    # Relative deviation from the moving average (key anomaly indicator);
    # the epsilon guards against division by zero.
    ordered["volume_deviation"] = (ordered["volume_usd"] / (ordered["volume_ma24"] + 1e-10)) - 1
    # 24-hour volatility (rolling standard deviation).
    ordered["volume_volatility"] = rolling_vol.std()
    # Volume relative to TVL (very high values can indicate anomalies).
    ordered["volume_tvl_ratio"] = ordered["volume_usd"] / (ordered["tvl_usd"] + 1e-10)
    # Hourly price change ratio.
    ordered["price_change"] = (ordered["close_price"] / ordered["open_price"]) - 1
    return ordered
# Apply the time features to each pool independently.
# NOTE(review): pandas emits a DeprecationWarning here (see cell output) because
# apply also receives the grouping column. Passing include_groups=False would
# silence it but would drop pool_address from enhanced_df, which later cells
# read — so the fix needs to re-select the column explicitly; left as-is here.
enhanced_df = raw_clean_with_pool.groupby("pool_address").apply(add_time_features)

# Clip extreme values — for visualization only, the model uses the raw features.
upper_tvl_ratio = np.percentile(enhanced_df["volume_tvl_ratio"].dropna(), 99)
upper_deviation = np.percentile(enhanced_df["volume_deviation"].dropna(), 99)
lower_deviation = np.percentile(enhanced_df["volume_deviation"].dropna(), 1)
plot_df = enhanced_df.copy()
plot_df["volume_tvl_ratio"] = plot_df["volume_tvl_ratio"].clip(upper=upper_tvl_ratio)
plot_df["volume_deviation"] = plot_df["volume_deviation"].clip(upper=upper_deviation, lower=lower_deviation)

# Visualize the anomaly indicators.
# Color map keyed to price change, scaled to its 5th–95th percentile range.
cmap = plt.cm.coolwarm
norm = Normalize(vmin=plot_df["price_change"].quantile(0.05), vmax=plot_df["price_change"].quantile(0.95))
# Scatter plot of the two anomaly indicators.
fig, ax = plt.subplots(figsize=(12, 8))
scatter = sns.scatterplot(
    data=plot_df,
    x="volume_tvl_ratio",
    y="volume_deviation",
    hue="price_change",
    palette=cmap,
    hue_norm=norm,
    alpha=0.7,
    edgecolor="none",
    s=50,  # point size
    ax=ax,  # pass the Axes explicitly
)
# Colorbar driven by the same norm as the hue mapping.
sm = ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax)
cbar.set_label("価格変動率")
# Figure labels.
plt.title("取引量の異常指標", fontsize=16)
plt.xlabel("取引量/TVL比率")
plt.ylabel("移動平均からの乖離率")
# Remove the hue legend — the colorbar replaces it.
scatter.get_legend().remove()
# Annotate the 5 most anomalous points (tooltip substitute).
anomaly_score = plot_df["volume_deviation"].abs() * plot_df["volume_tvl_ratio"]
top_anomalies = plot_df.loc[anomaly_score.nlargest(5).index]
for idx, row in top_anomalies.iterrows():
    plt.annotate(
        f"Pool: {row['pool_address'][:8]}...\nDate: {row['datetime'].strftime('%Y-%m-%d %H:%M')}",
        xy=(row["volume_tvl_ratio"], row["volume_deviation"]),
        xytext=(10, 10),
        textcoords="offset points",
        bbox=dict(boxstyle="round,pad=0.5", fc="yellow", alpha=0.5),
        arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0"),
    )
plt.tight_layout()
plt.show()

# Optional: time-series of volume_deviation for the pools behind the top anomalies.
top_anomaly_pools = top_anomalies["pool_address"].unique()[:3]  # up to 3 pools
plt.figure(figsize=(14, 8))
for pool in top_anomaly_pools:
    pool_data = enhanced_df[enhanced_df["pool_address"] == pool].copy()
    pool_data = pool_data.sort_values("datetime")
    sns.lineplot(data=pool_data, x="datetime", y="volume_deviation", label=f"Pool {pool[:8]}...")
plt.axhline(y=0, color="r", linestyle="--", alpha=0.3)
plt.title("移動平均からの乖離率の時系列推移(異常検知上位プール)")
plt.xlabel("日時")
plt.ylabel("移動平均からの乖離率")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
/tmp/ipykernel_72670/1444843618.py:29: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
enhanced_df = raw_clean_with_pool.groupby("pool_address").apply(add_time_features)
/usr/local/lib/python3.11/site-packages/matplotlib/colors.py:2295: RuntimeWarning: invalid value encountered in divide
resdat /= (vmax - vmin)
多変量相関分析とプール特性の把握¶
In [21]:
# 1. Correlation heatmap across the key pool metrics.
numeric_cols = ["volume_usd", "tvl_usd", "fees_usd", "tx_count", "liquidity", "volume_deviation", "volume_tvl_ratio"]
corr = enhanced_df[numeric_cols].corr()

plt.figure(figsize=(12, 10))
# A triangular mask used to be computed here but was never passed to sns.heatmap
# (dead variable, removed). To hide the redundant upper triangle, pass
# mask=np.triu(np.ones_like(corr, dtype=bool)) to the call below.
heatmap = sns.heatmap(
    corr,
    annot=True,  # print the coefficient in each cell
    fmt=".2f",  # two decimal places
    cmap="coolwarm",  # diverging color map
    vmin=-1,
    vmax=1,  # correlation bounds
    center=0,  # neutral color at zero correlation
    square=True,  # square cells
    linewidths=0.5,  # gridline width between cells
    cbar_kws={"shrink": 0.8},  # colorbar sizing
)
plt.title("指標間の相関係数", fontsize=16, pad=20)
plt.tight_layout()
plt.show()
# 2. Per-fee-tier statistics comparison.
def anomaly_rate(x):
    """Fraction of elements whose absolute value exceeds 3."""
    flagged = abs(x) > 3
    return flagged.mean()
# Aggregate volume/TVL/tx statistics per fee tier; anomaly_rate gives the share
# of hours with |volume_deviation| > 3.
fee_tier_stats = enhanced_df.groupby("fee_tier").agg(
    {
        "volume_usd": ["mean", "std", "max"],
        "tvl_usd": ["mean", "std"],
        "tx_count": "mean",
        "volume_deviation": ["mean", "std", anomaly_rate],
    }
)
# Flatten the MultiIndex columns produced by .agg into "col_stat" names.
fee_tier_stats.columns = [
    "_".join(col).strip() if isinstance(col, tuple) else col for col in fee_tier_stats.columns.values
]
fee_tier_stats = fee_tier_stats.reset_index()

# Anomaly rate per fee tier. Assign x to `hue` with legend=False instead of
# passing `palette` alone — the bare-palette form is deprecated (seaborn >= 0.13
# FutureWarning seen in the original output) and will be removed in v0.14.
plt.figure(figsize=(10, 6))
bar = sns.barplot(
    data=fee_tier_stats,
    x="fee_tier",
    y="volume_deviation_anomaly_rate",
    hue="fee_tier",
    palette="viridis",
    legend=False,
)
# Annotate each bar with its percentage value.
for p in bar.patches:
    height = p.get_height()
    bar.text(p.get_x() + p.get_width() / 2.0, height + 0.01, f"{height:.2%}", ha="center", fontsize=10)
plt.title("フィーティア別の異常発生率(volume_deviation > 3σ)", fontsize=14)
plt.xlabel("フィーティア")
plt.ylabel("異常発生率")
plt.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()
# 3. Most common token-pair combinations.
token_pairs = enhanced_df.groupby(["token0_symbol", "token1_symbol"]).size().reset_index(name="count")
top_pairs = token_pairs.sort_values("count", ascending=False).head(15)
# Combined "token0-token1" label for display.
top_pairs["pair_label"] = top_pairs["token0_symbol"] + "-" + top_pairs["token1_symbol"]

plt.figure(figsize=(12, 10))
# Assign y to `hue` with legend=False instead of a bare `palette` — the latter
# is deprecated (seaborn >= 0.13 FutureWarning seen in the original output).
bars = sns.barplot(
    data=top_pairs,
    y="pair_label",  # combined label on the y axis
    x="count",
    hue="pair_label",
    palette="muted",
    legend=False,
    orient="h",  # horizontal bars
)
# Print the count inside each bar, right-aligned.
for p in bars.patches:
    width = p.get_width()
    bars.text(
        width * 0.98,
        p.get_y() + p.get_height() / 2.0,
        f"{int(width)}",
        ha="right",
        va="center",
        color="white",
        fontweight="bold",
        fontsize=10,
    )
plt.title("最も一般的なトークンペア", fontsize=14)
plt.xlabel("件数")
plt.ylabel("")  # y label unnecessary — the pair labels are self-explanatory
plt.tight_layout()
plt.show()
# Bonus: heatmap of the key per-fee-tier statistics.
# Keep only the main statistics.
key_stats = ["volume_usd_mean", "tvl_usd_mean", "tx_count_mean", "volume_deviation_anomaly_rate"]
stats_df = fee_tier_stats[["fee_tier"] + key_stats].set_index("fee_tier")
# Display labels for each statistic.
formatted_labels = {
    "volume_usd_mean": "平均取引量",
    "tvl_usd_mean": "平均TVL",
    "tx_count_mean": "平均取引回数",
    "volume_deviation_anomaly_rate": "異常発生率",
}
# Min-max normalize each metric to [0, 1] so the colors are comparable.
# NOTE(review): divides by (max - min) — a constant column would produce NaN.
normalized_stats = stats_df.copy()
for col in key_stats:
    normalized_stats[col] = (stats_df[col] - stats_df[col].min()) / (stats_df[col].max() - stats_df[col].min())
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_stats,  # colors come from the normalized values…
    annot=stats_df,  # …but cells are annotated with the raw values
    fmt=".2g",  # general number format
    cmap="YlGnBu",
    linewidths=0.5,
    yticklabels=normalized_stats.index,
    xticklabels=[formatted_labels[col] for col in key_stats],
)
plt.title("フィーティア別の主要統計指標", fontsize=14)
plt.tight_layout()
plt.show()
/tmp/ipykernel_72670/4107957614.py:46: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. bar = sns.barplot(data=fee_tier_stats, x="fee_tier", y="volume_deviation_anomaly_rate", palette="viridis")
/tmp/ipykernel_72670/4107957614.py:68: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. bars = sns.barplot(
In [ ]:
In [ ]:
In [ ]:
ベースラインモデルの定義¶
In [22]:
# Feature set used for unsupervised anomaly detection.
anomaly_features = ["volume_usd", "tvl_usd", "tx_count", "volume_deviation", "volume_tvl_ratio", "price_change"]

# Drop rows with missing feature values and reset the index so positional
# alignment with the model output below is guaranteed.
model_df = enhanced_df.dropna(subset=anomaly_features).reset_index(drop=True)

# Standardize the features, then fit an Isolation Forest expecting ~1% anomalies.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(model_df[anomaly_features])
clf = IsolationForest(contamination=0.01, random_state=42)
# fit_predict returns -1 for anomalies, +1 for inliers.
model_df["anomaly_score"] = clf.fit_predict(scaled_features)
model_df["is_anomaly"] = model_df["anomaly_score"].eq(-1)
# Keep only the rows flagged as anomalies.
anomaly_df = model_df[model_df["is_anomaly"]].copy()

# Shortened pool address for readable plot labels.
anomaly_df["pool_short"] = anomaly_df["pool_address"].str[:8] + "..."

# Marker sizes for the scatter plot: rescale |volume_deviation| into [50, 250].
if len(anomaly_df) == 0:
    size_var = []  # nothing to plot
else:
    deviation_mag = np.abs(anomaly_df["volume_deviation"])
    spread = deviation_mag.max() - deviation_mag.min()
    if spread > 0:
        size_var = 50 + (deviation_mag - deviation_mag.min()) / spread * 200
    else:
        size_var = 100  # all magnitudes identical → one default size
# 1. Visualize the detected anomalies.
if len(anomaly_df) > 0:
    plt.figure(figsize=(14, 8))
    # One distinct color per pool present among the anomalies.
    unique_pools = anomaly_df["pool_short"].nunique()
    palette = sns.color_palette("husl", unique_pools)
    # Scatter: time vs volume, colored by pool, sized by deviation magnitude.
    scatter = sns.scatterplot(
        data=anomaly_df,
        x="datetime",
        y="volume_usd",
        hue="pool_short",
        size=size_var,
        sizes=(50, 250),  # marker size range
        palette=palette,
        alpha=0.7,
        edgecolor="black",
        linewidth=0.5,
    )
    # Figure labels.
    plt.title("検出された異常(Isolation Forest)", fontsize=16)
    plt.xlabel("日時")
    plt.ylabel("取引量 (USD)")
    # Auto-rotate/format the date ticks.
    plt.gcf().autofmt_xdate()
    # Trim the legend when there are many pools.
    if unique_pools > 10:
        # Keep only the first entries so the legend stays readable.
        handles, labels = scatter.get_legend_handles_labels()
        plt.legend(handles[:11], labels[:11], title="Pool", fontsize=9, loc="best")
    else:
        plt.legend(title="Pool", fontsize=9, loc="best")
    # Annotate the 5 largest anomalies by volume with their token pair.
    if len(anomaly_df) >= 5:
        top_anomalies = anomaly_df.nlargest(5, "volume_usd")
    else:
        top_anomalies = anomaly_df
    for idx, row in top_anomalies.iterrows():
        plt.annotate(
            f"{row['token0_symbol']}-{row['token1_symbol']}",
            xy=(row["datetime"], row["volume_usd"]),
            xytext=(10, 0),
            textcoords="offset points",
            fontsize=9,
            bbox=dict(boxstyle="round,pad=0.3", fc="yellow", alpha=0.5),
        )
    plt.tight_layout()
    plt.show()
else:
    print("異常は検出されませんでした。")
# 2. Anomaly counts per pool.
# as_index=False keeps pool_address as a regular column after the groupby.
anomaly_by_pool = model_df.groupby("pool_address", as_index=False)["is_anomaly"].sum()
anomaly_by_pool = anomaly_by_pool[anomaly_by_pool["is_anomaly"] > 0].sort_values("is_anomaly", ascending=False)

if len(anomaly_by_pool) > 0:
    top10_pools = anomaly_by_pool.head(10).copy()
    # Shorten the pool addresses for readable labels.
    top10_pools["pool_short"] = top10_pools["pool_address"].apply(lambda x: x[:10] + "...")
    plt.figure(figsize=(12, 7))
    # Assign y to `hue` with legend=False instead of a bare `palette` — the
    # latter is deprecated (seaborn >= 0.13 FutureWarning in the original output).
    bars = sns.barplot(
        data=top10_pools,
        x="is_anomaly",
        y="pool_short",
        hue="pool_short",
        palette="viridis",
        legend=False,
        orient="h",  # horizontal bars
    )
    # Print each count just right of its bar.
    for p in bars.patches:
        width = p.get_width()
        plt.text(
            width + 0.3,  # small offset to the right
            p.get_y() + p.get_height() / 2,
            f"{int(width)}",
            ha="left",
            va="center",
        )
    plt.title("異常が最も多く検出されたプールTop10", fontsize=14)
    plt.xlabel("異常データ数")
    plt.ylabel("プールアドレス")
    plt.tight_layout()
    plt.show()
else:
    print("異常は検出されませんでした。")
# 3. Bonus: anomaly counts per token pair.
if len(anomaly_df) > 0:
    # Combined token-pair label.
    anomaly_df["token_pair"] = anomaly_df["token0_symbol"] + "-" + anomaly_df["token1_symbol"]
    # Count anomalies per pair and keep the top 10.
    anomaly_by_pair = anomaly_df.groupby("token_pair").size().reset_index(name="count")
    top_pairs = anomaly_by_pair.sort_values("count", ascending=False).head(10)
    plt.figure(figsize=(12, 7))
    # Assign y to `hue` with legend=False instead of a bare `palette` — the
    # latter is deprecated (seaborn >= 0.13 FutureWarning in the original output).
    bars = sns.barplot(
        data=top_pairs, x="count", y="token_pair", hue="token_pair", palette="muted", legend=False, orient="h"
    )
    # Print each count just right of its bar.
    for p in bars.patches:
        width = p.get_width()
        plt.text(width + 0.1, p.get_y() + p.get_height() / 2, f"{int(width)}", ha="left", va="center")
    plt.title("異常が最も多く検出されたトークンペアTop10", fontsize=14)
    plt.xlabel("異常データ数")
    plt.ylabel("トークンペア")
    plt.tight_layout()
    plt.show()
/tmp/ipykernel_72670/1810225964.py:106: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. bars = sns.barplot(
/tmp/ipykernel_72670/1810225964.py:143: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. bars = sns.barplot(data=top_pairs, x="count", y="token_pair", palette="muted", orient="h")
In [ ]:
In [ ]:
In [ ]: